package bio.pih.lucene; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.NoSuchElementException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import bio.pih.genoogle.io.reader.IOTools; import bio.pih.genoogle.io.reader.ParseException; import bio.pih.genoogle.io.reader.RichSequenceStreamReader; import bio.pih.genoogle.seq.DNAAlphabet; import bio.pih.genoogle.seq.IllegalSymbolException; import bio.pih.genoogle.seq.RichSequence; public class LuceneIndexer { private static final IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)); public final IndexWriter indexWriter; public LuceneIndexer(String databasePath, int databaseFragment) throws IOException { Directory indexDir = FSDirectory.open(new File(databasePath, Integer.toString(databaseFragment))); indexWriter = new IndexWriter(indexDir, indexWriterConfig); } private static int nextSequenceId = 0; public static void main(String[] args) throws IOException, IllegalSymbolException, NoSuchElementException, ParseException { Directory indexDir = FSDirectory.open(new File("./index")); if (new File("./index").exists()) { IndexSearcher is = new IndexSearcher(indexDir); Query q = new TermQuery(new Term("header", "100")); TopDocs search = is.search(q, 20); System.out.println(search.totalHits); System.out.println(search.scoreDocs[0]); } else { final boolean forceFormatting = true; IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)); IndexWriter indexWriter = new IndexWriter(indexDir, indexWriterConfig); BufferedReader is = new BufferedReader(new FileReader("/Users/albrecht/genoogle/files/fasta/ecoli.nt")); RichSequenceStreamReader readFastaDNA = IOTools.readFasta(is, DNAAlphabet.SINGLETON); while (readFastaDNA.hasNext()) { RichSequence s; try { s = readFastaDNA.nextRichSequence(); } catch (IllegalSymbolException e) { if (forceFormatting) { continue; } else { throw e; } } int id = getNextSequenceId(); String gi = s.getGi(); String name = s.getName(); String type = s.getType(); String accession = s.getAccession(); String description = s.getDescription(); String header = s.getHeader(); System.out.println(id); System.out.println(gi); System.out.println(name); System.out.println(type); System.out.println(accession); System.out.println(description); Document doc = new Document(); doc.add(new Field("header", header, Store.YES, Index.ANALYZED)); doc.add(new Field("gi", gi, Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("name", name, Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("type", type, Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("accession", accession, Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("description", description, Store.YES, Index.ANALYZED)); doc.add(new Field("id", Integer.toString(id), Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("file", "ecoli.nt", Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("db", "ECOLI_DB", Store.YES, Index.NOT_ANALYZED)); indexWriter.addDocument(doc); } indexWriter.optimize(); indexWriter.close(); } } protected static int getNextSequenceId() { int id = nextSequenceId; nextSequenceId++; return id; } }